from urllib.request import urlopen  # python自带爬虫库
import pandas as pd
import time
import re  # 正则表达式库
import os  # 系统库
import json  # python自带的json数据库

# 从网页上抓取数据
def get_content_from_internet(url, max_try_num=10, sleep_time=5):
    """Fetch the raw bytes of *url*, retrying on transient failures.

    Parameters
    ----------
    url : str
        The URL to fetch (anything ``urllib.request.urlopen`` accepts).
    max_try_num : int
        Maximum number of attempts before giving up.
    sleep_time : int or float
        Seconds to sleep between failed attempts.

    Returns
    -------
    bytes
        The raw response body of the first successful attempt.

    Raises
    ------
    ValueError
        If every attempt fails.
    """
    for attempt in range(max_try_num):
        try:
            # Use a context manager so the HTTP response (and its socket) is
            # closed promptly instead of leaking until garbage collection.
            with urlopen(url=url, timeout=10) as response:
                return response.read()
        except Exception as e:
            print('抓取数据报错，次数：', attempt + 1, '报错内容：', e)
            time.sleep(sleep_time)

    # All attempts exhausted without a successful fetch.
    raise ValueError('使用urlopen抓取网页数据不断报错，达到尝试上限，停止程序，请尽快检查问题所在')

def get_market_cap():
    """Scrape code, name and total market cap for all A-share stocks from Sina.

    Iterates the paginated Sina quotes API (80 rows per page) until an empty
    page is returned.

    Returns
    -------
    pandas.DataFrame
        Columns '股票代码' (code), '股票名称' (name), '总市值' (total market
        cap converted to CNY). Empty DataFrame if no pages were fetched.
    """
    raw_url = 'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page=%s' \
              '&num=80&sort=symbol&asc=1&node=hs_a&symbol=&_s_r_a=sort'
    page_num = 1

    # Collect per-page frames and concatenate once at the end: DataFrame.append
    # was removed in pandas 2.0, and per-iteration appends copy quadratically.
    page_frames = []
    while True:
        # Build the URL for the current page.
        url = raw_url % (page_num,)
        print('开始抓取页数：', page_num)

        # Fetch and decode (the Sina endpoint serves GBK-encoded text).
        content = get_content_from_internet(url)
        content = content.decode('gbk')

        # An empty payload marks the end of the listing.
        if '[]' in content or 'null' in content:
            print('抓取到页数的尽头，退出循环')
            break

        # The API returns JS-style objects with unquoted keys; quote the keys
        # so json.loads can parse the payload.
        content = re.sub(r'(?<={|,)([a-zA-Z][a-zA-Z0-9]*)(?=:)', r'"\1"', content)

        # Parse into a list of dicts.
        content = json.loads(content)

        # Build the page frame and keep only the columns we need.
        df = pd.DataFrame(content, dtype='float')
        df.rename(columns={'symbol': '股票代码', 'name': '股票名称', 'mktcap': '总市值'}, inplace=True)
        df = df[['股票代码', '股票名称', '总市值']]
        # Sina reports market cap in units of 10,000 CNY; convert to CNY.
        df['总市值'] = round(df['总市值'] * 10000, 2)
        page_frames.append(df)

        # Next page; small delay to be polite to the server.
        page_num += 1
        time.sleep(1)

    # ===Return the combined result (empty frame when nothing was fetched).
    if page_frames:
        return pd.concat(page_frames, ignore_index=True)
    return pd.DataFrame()

# ---- Script body: build the bottom-10-by-market-cap list and save it ----
df = get_market_cap()
print(df[df['股票代码'].str.contains('sh688')])
sh688 = df[df['股票代码'].str.contains('sh688')]  # STAR-market (sh688*) stocks
bj = df[df['股票代码'].str.contains('bj')]  # Beijing exchange stocks
# ST-flagged names, plus STAR-market stocks (so dropping `st` covers both).
st = df[(df['股票名称'].str.contains('ST')) | (df['股票代码'].str.contains('sh688'))]
df = df.drop(st.index)
df = df.drop(bj.index)

# Smallest market caps first. Use a real bool — passing the integer 1 for
# `ascending` is deprecated in pandas.
df = df.sort_values(by=['总市值'], ascending=True)

now = time.localtime()
nowt = time.strftime("%Y-%m-%d %H:%M:%S", now)
filename_day = time.strftime("%Y-%m-%d", now)
print("更新时间：", nowt)

# Keep only the 10 smallest-cap stocks.
df = df.head(10)

# === Choose ONE of the two output modes below
# === 1. Each run writes a CSV named by date
all_data_path = os.path.abspath(os.path.dirname(__file__))  # directory of this file
path = os.path.join(all_data_path, 'data', 'market_cap_last10', filename_day + '.csv')
# Create the target directory on first run instead of crashing on to_csv.
os.makedirs(os.path.dirname(path), exist_ok=True)
pd.DataFrame(columns=['更新时间：' + nowt]).to_csv(path, index=False, encoding='gbk')
df.to_csv(path, index=False, mode='a', encoding='gbk')

# === 2. Each run appends the latest data to one fixed CSV (no overwrite)
# path = r'C:\Users\wanbabi\Desktop\xbx_stock_2019\data\market_cap_last10\market_cap_last10.csv'
# ut = pd.DataFrame(columns=['更新时间：' + nowt])
# df = pd.concat([df, ut])
# df.to_csv(path, index=False, mode='a', encoding='gbk')